Seurat Weekly NO.3||定制可视化
天子呼来不上船,
自称臣是菜鸟团
在这里,和国际同行一起学习单细胞数据分析。
很多CNS文章的fig1(寓指tsne/umap图谱),打眼一看就是Seurat出的。Seurat 确实用ggplot2 包装了大量的可视化函数。
但是在文章的后面,特别是用到统计信息的时候,Seurat的默认参数就不能直接用了。这就到了考验基本功的时候,有的需要朝里看源码,有的需要朝外包装Seurat。
本期的Seurat Weekly
,你们家运来小哥哥就带大家看看扩展Seurat可视化函数的思路。每个图都反映了一种特定的思路,我们来一一解说。
载入R包和我们的老朋友pbmc3k.final
。
library(Seurat)
library(SeuratData)
library(ggforce)
library(ggpubr)
pbmc3k.final
An object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features)
2 dimensional reductions calculated: pca, umap
第零种,教程找的勤
找教程确实可以解决大部分问题。如两样本的差异基因可视化:
library(ggplot2)
library(cowplot)
theme_set(theme_cowplot())
pbmc3k.final$stim <- sample(c("rep1", "rep2"), size = ncol(pbmc3k.final), replace = TRUE)
Idents(pbmc3k.final)
t.cells <- subset(pbmc3k.final, idents = "Naive CD4 T")
Idents(t.cells) <- "stim"
avg.t.cells <- log1p(AverageExpression(t.cells, verbose = FALSE)$RNA)
avg.t.cells$gene <- rownames(avg.t.cells)
cd14.mono <- subset(pbmc3k.final, idents = "CD14+ Mono")
Idents(cd14.mono) <- "stim"
avg.cd14.mono <- log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA)
avg.cd14.mono$gene <- rownames(avg.cd14.mono)
genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8")
p1 <- ggplot(avg.t.cells, aes(rep1, rep2)) + geom_point() + ggtitle("CD4 Naive T Cells")
p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE)+ theme_bw()
p2 <- ggplot(avg.cd14.mono, aes(rep1, rep2)) + geom_point() + ggtitle("CD14 Monocytes")
p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE) +
geom_smooth() +
geom_abline(intercept=1,slope=1 ,colour="#990000", linetype="dashed" )+
geom_abline(intercept=-1,slope=1 ,colour="#990000", linetype="dashed" )+ theme_bw()
plot_grid(p1, p2)
第一种:改源码
以DoHeatmap
为例,我们知道Seurat的热图可以加一条idents的bar分组信息,但是没有给参数来加更多的信息。如我们的细胞既有临床分组信息,又有细胞类型,又有细胞周期,我们该怎么办?
为了多一些metadata信息,我们计算一个细胞周期吧。其实这里更多的应该是你的临床样本信息,这样才有意思。
pbmc3k.final <- CellCycleScoring(
object = pbmc3k.final,
g2m.features = cc.genes$g2m.genes,
s.features = cc.genes$s.genes
)
head(pbmc3k.final@meta.data) ## 有了,有了
orig.ident nCount_RNA nFeature_RNA seurat_annotations percent.mt
AAACATACAACCAC pbmc3k 2419 779 Memory CD4 T 3.0177759
AAACATTGAGCTAC pbmc3k 4903 1352 B 3.7935958
AAACATTGATCAGC pbmc3k 3147 1129 Memory CD4 T 0.8897363
AAACCGTGCTTCCG pbmc3k 2639 960 CD14+ Mono 1.7430845
AAACCGTGTATGCG pbmc3k 980 521 NK 1.2244898
AAACGCACTGGTAC pbmc3k 2163 781 Memory CD4 T 1.6643551
RNA_snn_res.0.5 seurat_clusters S.Score G2M.Score Phase
AAACATACAACCAC 1 1 0.10502807 -0.04507596 S
AAACATTGAGCTAC 3 3 -0.02680010 -0.04986215 G1
AAACATTGATCAGC 1 1 -0.01387173 0.07792135 G2M
AAACCGTGCTTCCG 2 2 0.04595348 0.01140204 S
AAACCGTGTATGCG 6 6 -0.02602771 0.04413069 G2M
AAACGCACTGGTAC 1 1 -0.03692587 -0.08341568 G1
为了热图的基因有顺序,我们还是计算一个差异基因吧,使热图有框。
mhgen <- FindAllMarkers(pbmc3k.final,only.pos = T)
head(mhgen)
p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
RPS12 2.008629e-140 0.5029988 1.000 0.991 2.754633e-136 Naive CD4 T RPS12
RPS27 2.624075e-140 0.5020359 0.999 0.992 3.598656e-136 Naive CD4 T RPS27
RPS6 1.280169e-138 0.4673635 1.000 0.995 1.755623e-134 Naive CD4 T RPS6
RPL32 4.358823e-135 0.4242773 0.999 0.995 5.977689e-131 Naive CD4 T RPL32
RPS14 3.618793e-128 0.4283480 1.000 0.994 4.962812e-124 Naive CD4 T RPS14
RPS25 5.612007e-124 0.5203411 0.997 0.975 7.696306e-120 Naive CD4 T RPS25
好了我们可以绘制热图上面的bar了:
# Show we can color anything
cols.use <- list(Phase=c('red', 'blue', 'pink', 'brown')) # 指定颜色
pbmc3k.final <-ScaleData(pbmc3k.final,features= mk) # 为了我的差异基因都被Scale过
#png('mult.png')
DoMultiBarHeatmap(pbmc3k.final, features=mk, group.by='Phase', additional.group.by = c('seurat_clusters', 'seurat_annotations'), additional.group.sort.by = c('seurat_annotations'), cols.use=cols.use)
# dev.off()
大概这样子啦,想要吗?
第二种,封装它
其实这个方法我们之前见过,Seurat绘制堆积小提琴图用的就是这个。只堆积个小提琴图满足不了审稿人日益增长的需求啊,于是我们想在绘制表达量小提琴图的时候,加上统计信息。这不免让我们想起ggpubr
。指定一个基因集和需要作比较的分组信息,我这里是细胞类型,你可以是你的实验设计信息啊。
gene_sig <- c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")
comparisons <- list(c("Naive CD4 T", "CD8 T"))
Vlnpubr(pbmc3k.final,gene_signature = gene_sig, test_sign = comparisons,group_my = 'seurat_annotations')
第三种,等更新
既不会看源码,也不熟悉ggplot的世界,不会封装。这时候可以选择等Seurat团队把我们的想法实现之后再作图。这个代价有点大,单细胞数据贬值的速度可是正比于其火热的程度啊。
按照细胞类型分组绘制的DotPlot
,就是由于需求太过强烈,作者在V3.2中实现了。
packageVersion("Seurat") # 快看看你用的是哪个版本吧。[1] ‘3.2.2’
Idents(pbmc3k.final) <- as.vector(pbmc3k.final[['seurat_annotations',drop=T]])
# 直接指定某一列作为 Idents
Idents(pbmc3k, which(is.na(Idents(pbmc3k)))) <- "Unknown"
Bcellmarks <- c("CD19","CD79A","CD79B","MS4A1")
HLAmarks <- c("HLA-DQA1", "HLA-DQB1", "HLA-DRB3")
Monomarks <- c("CD14","VCAN","FCN1")
Tcellmarks <- c("CD3D","CD3E","CD3G","IL7R","TRAC","TRGC2","TRDC", "CD8A", "CD8B", "CD4")
DCmarks <- c(HLAmarks,"CLEC10A","CLEC9A")
platelet <- c("GP9","PF4")
features <- list("B cell" = Bcellmarks, "T cell" = Tcellmarks, "DC" = DCmarks, "Monocyte" = Monomarks, "Platelet" = platelet)
DotPlot(object = pbmc3k.final, features=features, cluster.idents=T) + theme(axis.text.x = element_text(angle = 90))
等更新还是封装它,还是改源码,这不就是现实扭曲场力吗?
Source Code
DoMultiBarHeatmap
suppressPackageStartupMessages({
library(rlang)
})
DoMultiBarHeatmap <- function (object,
features = NULL,
cells = NULL,
group.by = "ident",
additional.group.by = NULL,
additional.group.sort.by = NULL,
cols.use = NULL,
group.bar = TRUE,
disp.min = -2.5,
disp.max = NULL,
slot = "scale.data",
assay = NULL,
label = TRUE,
size = 5.5,
hjust = 0,
angle = 45,
raster = TRUE,
draw.lines = TRUE,
lines.width = NULL,
group.bar.height = 0.02,
combine = TRUE)
{
cells <- cells %||% colnames(x = object)
if (is.numeric(x = cells)) {
cells <- colnames(x = object)[cells]
}
assay <- assay %||% DefaultAssay(object = object)
DefaultAssay(object = object) <- assay
features <- features %||% VariableFeatures(object = object)
## Why reverse???
features <- rev(x = unique(x = features))
disp.max <- disp.max %||% ifelse(test = slot == "scale.data",
yes = 2.5, no = 6)
possible.features <- rownames(x = GetAssayData(object = object,
slot = slot))
if (any(!features %in% possible.features)) {
bad.features <- features[!features %in% possible.features]
features <- features[features %in% possible.features]
if (length(x = features) == 0) {
stop("No requested features found in the ", slot,
" slot for the ", assay, " assay.")
}
warning("The following features were omitted as they were not found in the ",
slot, " slot for the ", assay, " assay: ", paste(bad.features,
collapse = ", "))
}
if (!is.null(additional.group.sort.by)) {
if (any(!additional.group.sort.by %in% additional.group.by)) {
bad.sorts <- additional.group.sort.by[!additional.group.sort.by %in% additional.group.by]
additional.group.sort.by <- additional.group.sort.by[additional.group.sort.by %in% additional.group.by]
if (length(x = bad.sorts) > 0) {
warning("The following additional sorts were omitted as they were not a subset of additional.group.by : ",
paste(bad.sorts, collapse = ", "))
}
}
}
data <- as.data.frame(x = as.matrix(x = t(x = GetAssayData(object = object,
slot = slot)[features, cells, drop = FALSE])))
object <- suppressMessages(expr = StashIdent(object = object,
save.name = "ident"))
group.by <- group.by %||% "ident"
groups.use <- object[[c(group.by, additional.group.by[!additional.group.by %in% group.by])]][cells, , drop = FALSE]
plots <- list()
for (i in group.by) {
data.group <- data
if (!is_null(additional.group.by)) {
additional.group.use <- additional.group.by[additional.group.by!=i]
if (!is_null(additional.group.sort.by)){
additional.sort.use = additional.group.sort.by[additional.group.sort.by != i]
} else {
additional.sort.use = NULL
}
} else {
additional.group.use = NULL
additional.sort.use = NULL
}
group.use <- groups.use[, c(i, additional.group.use), drop = FALSE]
for(colname in colnames(group.use)){
if (!is.factor(x = group.use[[colname]])) {
group.use[[colname]] <- factor(x = group.use[[colname]])
}
}
if (draw.lines) {
lines.width <- lines.width %||% ceiling(x = nrow(x = data.group) *
0.0025)
placeholder.cells <- sapply(X = 1:(length(x = levels(x = group.use[[i]])) *
lines.width), FUN = function(x) {
return(Seurat:::RandomName(length = 20))
})
placeholder.groups <- data.frame(rep(x = levels(x = group.use[[i]]), times = lines.width))
group.levels <- list()
group.levels[[i]] = levels(x = group.use[[i]])
for (j in additional.group.use) {
group.levels[[j]] <- levels(x = group.use[[j]])
placeholder.groups[[j]] = NA
}
colnames(placeholder.groups) <- colnames(group.use)
rownames(placeholder.groups) <- placeholder.cells
group.use <- sapply(group.use, as.vector)
rownames(x = group.use) <- cells
group.use <- rbind(group.use, placeholder.groups)
for (j in names(group.levels)) {
group.use[[j]] <- factor(x = group.use[[j]], levels = group.levels[[j]])
}
na.data.group <- matrix(data = NA, nrow = length(x = placeholder.cells),
ncol = ncol(x = data.group), dimnames = list(placeholder.cells,
colnames(x = data.group)))
data.group <- rbind(data.group, na.data.group)
}
order_expr <- paste0('order(', paste(c(i, additional.sort.use), collapse=','), ')')
group.use = with(group.use, group.use[eval(parse(text=order_expr)), , drop=F])
plot <- Seurat:::SingleRasterMap(data = data.group, raster = raster,
disp.min = disp.min, disp.max = disp.max, feature.order = features,
cell.order = rownames(x = group.use), group.by = group.use[[i]])
if (group.bar) {
pbuild <- ggplot_build(plot = plot)
group.use2 <- group.use
cols <- list()
na.group <- Seurat:::RandomName(length = 20)
for (colname in rev(x = colnames(group.use2))) {
if (colname == i) {
colid = paste0('Identity (', colname, ')')
} else {
colid = colname
}
# Default
cols[[colname]] <- c(scales::hue_pal()(length(x = levels(x = group.use[[colname]]))))
#Overwrite if better value is provided
if (!is_null(cols.use[[colname]])) {
req_length = length(x = levels(group.use))
if (length(cols.use[[colname]]) < req_length){
warning("Cannot use provided colors for ", colname, " since there aren't enough colors.")
} else {
if (!is_null(names(cols.use[[colname]]))) {
if (all(levels(group.use[[colname]]) %in% names(cols.use[[colname]]))) {
cols[[colname]] <- as.vector(cols.use[[colname]][levels(group.use[[colname]])])
} else {
warning("Cannot use provided colors for ", colname, " since all levels (", paste(levels(group.use[[colname]]), collapse=","), ") are not represented.")
}
} else {
cols[[colname]] <- as.vector(cols.use[[colname]])[c(1:length(x = levels(x = group.use[[colname]])))]
}
}
}
# Add white if there's lines
if (draw.lines) {
levels(x = group.use2[[colname]]) <- c(levels(x = group.use2[[colname]]), na.group)
group.use2[placeholder.cells, colname] <- na.group
cols[[colname]] <- c(cols[[colname]], "#FFFFFF")
}
names(x = cols[[colname]]) <- levels(x = group.use2[[colname]])
y.range <- diff(x = pbuild$layout$panel_params[[1]]$y.range)
y.pos <- max(pbuild$layout$panel_params[[1]]$y.range) + y.range * 0.015
y.max <- y.pos + group.bar.height * y.range
pbuild$layout$panel_params[[1]]$y.range <- c(pbuild$layout$panel_params[[1]]$y.range[1], y.max)
plot <- suppressMessages(plot +
annotation_raster(raster = t(x = cols[[colname]][group.use2[[colname]]]), xmin = -Inf, xmax = Inf, ymin = y.pos, ymax = y.max) +
annotation_custom(grob = grid::textGrob(label = colid, hjust = 0, gp = gpar(cex = 0.75)), ymin = mean(c(y.pos, y.max)), ymax = mean(c(y.pos, y.max)), xmin = Inf, xmax = Inf) +
coord_cartesian(ylim = c(0, y.max), clip = "off"))
# same time run or not ggplot version ?
if ((colname == i ) && label) {
x.max <- max(pbuild$layout$panel_params[[1]]$x.range)
x.divs <- pbuild$layout$panel_params[[1]]$x.major
group.use$x <- x.divs
label.x.pos <- tapply(X = group.use$x, INDEX = group.use[[colname]],
FUN = median) * x.max
label.x.pos <- data.frame(group = names(x = label.x.pos),
label.x.pos)
plot <- plot + geom_text(stat = "identity",
data = label.x.pos, aes_string(label = "group",
x = "label.x.pos"), y = y.max + y.max *
0.03 * 0.5, angle = angle, hjust = hjust,
size = size)
plot <- suppressMessages(plot + coord_cartesian(ylim = c(0,
y.max + y.max * 0.002 * max(nchar(x = levels(x = group.use[[colname]]))) *
size), clip = "off"))
}
}
}
plot <- plot + theme(line = element_blank())
plots[[i]] <- plot
}
if (combine) {
plots <- CombinePlots(plots = plots)
}
return(plots)
}
Vlnpubr
Vlnpubr <- function(seo, gene_signature, file_name, test_sign,group_my,label = "p.signif"){
plot_case1 <- function(signature, y_max = NULL){
VlnPlot(seo , features = signature,
pt.size = 0.1,
group.by = group_my,
y.max = y_max # add the y-axis maximum value - otherwise p-value hidden
) + stat_compare_means(comparisons = test_sign, label = label ) + NoLegend()
}
plot_list <- list()
y_max_list <- list()
for (gene in gene_signature) {
plot_list[[gene]] <- plot_case1(gene)
y_max_list[[gene]] <- max(plot_list[[gene]]$data[[gene]]) # get the max no. for each gene
plot_list[[gene]] <- plot_case1(gene, y_max = (y_max_list[[gene]] + 1) )
}
cowplot::plot_grid(plotlist = plot_list)
#file_name <- paste0(file_name, "_r.png")
#ggsave(file_name, width = 14, height = 8)
}
参考:
https://github.com/satijalab/seurat/issues/2201
https://www.biostars.org/p/458261/